MS5130 A3 Clothing_Product_Analysis

Author

Manav

Product analysis using Dataset 1

# Loading all the required libraries

library(ggplot2) # For visualization
library(dplyr) # For data manipulation

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyr) # For tidying data
library(stringr) # For string manipulation
library(readr) # For reading CSV files
library(tidytext) 
Warning: package 'tidytext' was built under R version 4.3.3
library(plotly)
Warning: package 'plotly' was built under R version 4.3.3

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(wordcloud)
Warning: package 'wordcloud' was built under R version 4.3.3
Loading required package: RColorBrewer
library(quanteda)
Warning: package 'quanteda' was built under R version 4.3.3
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"ndiMatrix" of class "replValueSp"; definition not updated
Package version: 3.3.1
Unicode version: 15.1
ICU version: 74.1
Parallel computing: 12 of 12 threads used.
See https://quanteda.io for tutorials and examples.
library(leaflet)
Warning: package 'leaflet' was built under R version 4.3.3
library(tidygeocoder)
Warning: package 'tidygeocoder' was built under R version 4.3.3
# Reading the CSV File
product_catalog <- read_csv("myntra_products_catalog.csv")
Rows: 12491 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): ProductName, ProductBrand, Gender, Description, PrimaryColor
dbl (3): ProductID, Price (INR), NumImages

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(product_catalog)
[1] 12491
# Pre-processing data
# Pre-processing is done in three explicit steps on the catalogue tibble.

# Step 1: give the awkwardly named price column a syntactic name
product_catalog <- rename(product_catalog, PriceINR = `Price (INR)`)

# Step 2: discard every row that contains an NA in any column
product_catalog <- drop_na(product_catalog)

# Step 3: strip leading and trailing whitespace from all character columns
# Adapted from stackoverflow: https://stackoverflow.com/questions/20760547/removing-whitespace-from-a-whole-data-frame-in-r
product_catalog <- mutate(
  product_catalog,
  across(where(is.character), trimws)
)

head(product_catalog)
# A tibble: 6 × 8
  ProductID ProductName       ProductBrand Gender PriceINR NumImages Description
      <dbl> <chr>             <chr>        <chr>     <dbl>     <dbl> <chr>      
1  10017413 DKNY Unisex Blac… DKNY         Unisex    11745         7 Black and …
2  10016283 EthnoVogue Women… EthnoVogue   Women      5810         7 Beige & Gr…
3  10009781 SPYKAR Women Pin… SPYKAR       Women       899         7 Pink colou…
4  10015921 Raymond Men Blue… Raymond      Men        5599         5 Blue self-…
5  10017833 Parx Men Brown &… Parx         Men         759         5 Brown and …
6  10014361 SHOWOFF Men Brow… SHOWOFF      Men         791         5 Brown soli…
# ℹ 1 more variable: PrimaryColor <chr>

Expensive and budget-friendly brands

# Calculating the average price for each brand
brand_prices <- product_catalog %>%
  group_by(ProductBrand) %>%
  # One row per brand: mean price plus the number of catalogue entries
  # Adapted from stackoverflow : https://stackoverflow.com/questions/25759891/dplyr-summarise-each-with-na-rm
  summarise(
    AveragePrice = mean(PriceINR, na.rm = TRUE),
    NumProducts = n(),
    .groups = "drop"
  ) %>%
  # Only brands with more than 10 products are kept
  filter(NumProducts > 10) %>%
  # Most expensive brand first
  arrange(desc(AveragePrice))

# Identifying the top 3 most expensive brands and cheap brands
# brand_prices is sorted by descending average price, so the first three rows
# are the most expensive brands and the last three rows are the cheapest.
top_expensive_brands <- brand_prices %>% slice_head(n = 3)

top_cheap_brands <- brand_prices %>% slice_tail(n = 3)

# Displaying the results
print("Top 3 Most Expensive Brands: ") 
[1] "Top 3 Most Expensive Brands: "
print(top_expensive_brands)
# A tibble: 3 × 3
  ProductBrand AveragePrice NumProducts
  <chr>               <dbl>       <int>
1 SEIKO              18259.          17
2 DKNY               16142.          26
3 PRESTO             10981.          26
print("Top 3 Budget Friendly Brands: ")
[1] "Top 3 Budget Friendly Brands: "
print(top_cheap_brands)
# A tibble: 3 × 3
  ProductBrand      AveragePrice NumProducts
  <chr>                    <dbl>       <int>
1 Urban Dog                 420.          14
2 Alvaro Castagnino         419.          11
3 FabSeasons                373.          15

Gender wise product distribution

# Getting product counts grouped by gender
# count() groups by Gender, tallies the rows into `Count`, and returns an
# ungrouped result in one call.
gender_counts <- count(product_catalog, Gender, name = "Count")

# Creating a bar graph for the number of products by gender
# One bar per gender, with the product count printed in the middle of each bar
ggplot(gender_counts, aes(x = Gender, y = Count, fill = Gender)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = Count), position = position_stack(vjust = 0.5)) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Gender wise product distribution",
    x = "Gender",
    y = "Number of Products"
  ) +
  theme_minimal()

Color wise product distribution

# Getting product counts for each color
# Tally products per primary color; sort = TRUE orders by descending count
color_counts <- count(
  product_catalog,
  PrimaryColor,
  name = "Count",
  sort = TRUE
)
nrow(color_counts)
[1] 27
# Identifying top 5 colors
# color_counts is sorted by descending count, so the first five rows are the
# five most common colors.
top_colors <- head(color_counts, 5)


# Putting the rest of the colors in "Others" category.
# Negative indexing drops the first five counts and yields an empty vector
# (sum 0) when there are five or fewer colors. The previous
# tail(n = nrow - 5) form received a negative n in that case and re-counted
# rows that are already part of the top five.
others_count <- sum(color_counts$Count[-seq_len(5)])
others_row <- tibble(PrimaryColor = "Others", Count = others_count)

# Combining the top colors with "Others" (both inputs are tibbles)
final_data <- bind_rows(top_colors, others_row)

final_data
# A tibble: 6 × 2
  PrimaryColor Count
  <chr>        <int>
1 Blue          3443
2 Black         1640
3 Red           1543
4 Green          908
5 White          880
6 Others        3183
# setting color styles manually
# Adapted from open ai : 'Prompt - Color codes for the below colors'
# Fill colors for the pie chart, keyed by the PrimaryColor label they represent
named_colors <- c(
  Black  = "#808080", # lighter grey instead of black
  Blue   = "#ADD8E6", # light blue
  Red    = "#FF9999", # light red
  Green  = "#90EE90", # light green
  White  = "#F8F8FF", # off-white (ghost white)
  Others = "#D8BFD8"  # light purple (thistle)
)

# Changing the 'final_data' values as percentage of whole count
# Share of each color bucket, expressed as a percentage of all products
final_data <- final_data %>%
  mutate(Percentage = (Count / sum(Count)) * 100)

# A single stacked bar wrapped into polar coordinates gives the pie chart;
# each slice is labelled with its percentage.
ggplot(final_data, aes(x = "", y = Count, fill = PrimaryColor)) +
  geom_col(width = 1, color = "white") +
  geom_text(
    aes(label = sprintf("%1.1f%%", Percentage)),
    position = position_stack(vjust = 0.5)
  ) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = named_colors) +
  labs(title = "Color Based Product Distribution") +
  theme_void()

# blue, black and red are the most popular color categories

Sentiment and Review analysis using Dataset 2 and 3

# Reading the other 2 datasets (Detailed Product dataset and Product Review dataset)

Product_Detailed_2 <- read_csv("Product_Detailed_2.csv")
Rows: 526564 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (8): URL, BrandName, Category, Individual_category, category_by_Gender, ...
dbl (5): Product_id, DiscountPrice (in Rs), OriginalPrice (in Rs), Ratings, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Product_Reviews_3 <- read_csv("Product_Reviews_3.csv")
Rows: 23486 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): Title, Review Text, Division Name, Category, Class Name
dbl (6): index, Clothing ID, Age, Rating, Recommended IND, Positive Feedback...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Manually creating the mapping based on domain knowledge so that the Categories from both the dataset can be matched to each other

# Lookup table: Dataset 2 category label -> Dataset 3 category label
category_map <- c(
  "Inner Wear & Sleep Wear" = "Intimate",
  "Lingerie & Sleep Wear" = "Intimate",
  "Western" = "Dresses",
  "Bottom Wear" = "Bottoms",
  "Topwear" = "Tops",
  "Sports Wear" = "Jackets",
  "Indian Wear" = "Trend"
)

# Labels missing from the lookup come back as NA and fall through to the
# original Category value via coalesce().
Product_Detailed_2 <- Product_Detailed_2 %>%
  mutate(Category = coalesce(
    unname(category_map[as.character(Category)]),
    as.character(Category)
  ))

Sentiment scores and average ratings

# Average rating using Detailed product dataset (Dataset 2)

# Mean of the Ratings column per (re-mapped) Category, ignoring missing ratings
avg_rating_by_category <- Product_Detailed_2 %>%
  group_by(Category) %>%
  summarise(
    AverageRating = mean(Ratings, na.rm = TRUE),
    .groups = "drop"
  )

# Sentiment scores using Product Review dataset (Dataset 3)

sentiment_analysis <- Product_Reviews_3 %>%
  # One row per word of each review
  unnest_tokens(word, 'Review Text') %>%
  # Keep only words present in the Bing lexicon; a review with no lexicon
  # word therefore produces no row in the result
  inner_join(get_sentiments("bing"), by = "word", relationship = "many-to-many") %>%
  # Score every matched word: +1 positive, -1 negative
  mutate(word_score = case_when(
    sentiment == "positive" ~ 1,
    sentiment == "negative" ~ -1,
    TRUE ~ 0L
  )) %>%
  # Net score per review
  group_by(index) %>%
  summarise(sentiment_score = sum(word_score), .groups = 'drop')


# plotting sentiment score histogram

# Histogram of the per-review net sentiment score (50 bins)
ggplot(sentiment_analysis, aes(x = sentiment_score)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Distribution of Sentiment Scores",
    x = "Sentiment Score",
    y = "Count"
  )

# Joining the sentiments back to the review dataset
# Attach the score to every review; reviews without a score keep NA
reviewdataset_with_sentiments <- left_join(
  Product_Reviews_3,
  sentiment_analysis,
  by = "index"
)

# Mean score per Category, skipping reviews whose score is NA
avg_sentiments_by_category <- reviewdataset_with_sentiments %>%
  group_by(Category) %>%
  summarise(
    average_sentiment = mean(sentiment_score, na.rm = TRUE),
    .groups = "drop"
  )

# Visualizing sentiment scores across categories
# Rows with a missing Category or missing average are removed before plotting
avg_sentiments_by_category <- na.omit(avg_sentiments_by_category)

# One bar per category, colored by its average sentiment
p <- plot_ly(
  data = avg_sentiments_by_category,
  x = ~Category,
  y = ~average_sentiment,
  type = 'bar',
  marker = list(color = ~average_sentiment, colorscale = 'Viridis'),
  showlegend = FALSE
) %>%
  layout(
    title = 'Distribution of Average Sentiment Scores by Category',
    xaxis = list(title = 'Category'),
    yaxis = list(title = 'Average Sentiment Score')
  )

p

The histogram shows a roughly normal distribution of sentiment scores: most scores cluster around the middle, suggesting that most sentiments expressed are neutral. There are fewer very positive or very negative sentiments, as seen by the lower bars at the ends.

This bar chart displays average sentiment scores for different clothing categories. Tops, Dresses, and Jackets seem to have higher average sentiments, indicating more positive feedback, while the ‘Trend’ category has a noticeably lower average sentiment score.

Insights using Sentiment score and ratings

# Joining the datasets on the 'Category' column and displaying the results
# Only categories present in both summaries survive the inner join
category_insights <- avg_rating_by_category %>%
  inner_join(avg_sentiments_by_category, by = "Category")

# Keep the three columns that are displayed and plotted
final_display <- category_insights[, c("Category", "AverageRating", "average_sentiment")]

print(final_display)
# A tibble: 6 × 3
  Category AverageRating average_sentiment
  <chr>            <dbl>             <dbl>
1 Bottoms           4.03              3.29
2 Dresses           4.10              3.21
3 Intimate          4.13              3.52
4 Jackets           4.20              3.23
5 Tops              4.13              3.58
6 Trend             4.01              2.64
# The bar chart
# Grouped bars: average rating next to average sentiment for each category
p <- plot_ly(data = final_display, x = ~Category) %>%
  add_bars(
    y = ~AverageRating,
    name = 'Average Rating',
    marker = list(color = '#ADD8E6')
  ) %>%
  add_bars(
    y = ~average_sentiment,
    name = 'Average Sentiment',
    marker = list(color = '#FF9999')
  ) %>%
  layout(
    title = 'Comparison of Average Ratings and Sentiments by Category',
    barmode = 'group',
    yaxis = list(title = 'Score')
  )

p

The bar chart shows comparison of average ratings and average sentiments across various categories such as “Bottoms,” “Dresses,” “Intimate,” and so on.

Insights from the chart:

Consistency: The average ratings and average sentiments are consistent across categories. This suggests that customers’ overall satisfaction levels are similar across different product types. Moreover the balance indicates that sentiment scores derived from reviews correlate well with the numerical ratings customers give. In other words, how customers speak about the product aligns with how they rate it.

Lack of extremes and overall positivity: There are no categories with extreme sentiment or rating scores. The positive ratings and sentiment values also indicate a moderately positive level of customer feeling towards the products in each category.

Wordcloud of Category with high sentiment

# Filtering the dataset for the category "Tops"
# Review texts for the "Tops" category as a plain character vector.
# The variable is named after the category it actually holds, and reviews
# with a missing text are dropped here so that corpus() does not have to
# replace NA with empty strings.
tops_reviews <- Product_Reviews_3 %>%
  filter(Category == "Tops", !is.na(`Review Text`)) %>%
  pull(`Review Text`)

# Creating a text corpus from the vector source of review text
text_corpus <- corpus(tops_reviews)
Warning: NA is replaced by empty string
# Preprocessing to remove punctuations, numbers, spaces and common english words like 'the', 'at' since they are not relevant for analysis
# tokens_remove() matches patterns as globs by default, so regex patterns such
# as "[[:punct:]]" or "\\d+" are treated literally and remove nothing.
# Punctuation, symbols and numbers are therefore dropped at tokenisation time
# (whitespace separators are already removed by tokens() by default), and
# tokens_remove() is used only for the literal English stopword list.
cleaned_corpus <- text_corpus %>%
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE
  ) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("en"))
# Adapted from: https://search.r-project.org/CRAN/refmans/quanteda/html/tokens_select.html

# Generate the word cloud
# Word frequencies are computed explicitly from a document-feature matrix and
# passed to wordcloud(), rather than handing it the tokens object and relying
# on its implicit tm-based conversion.
word_freq <- featfreq(dfm(cleaned_corpus))

set.seed(1234)
wordcloud(words = names(word_freq), freq = word_freq, scale = c(3, 0.5), max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
Loading required namespace: tm

The word cloud for “tops” highlights key words that stand out in customer reviews. “Love,” “like,” and “fit” are prominent, indicating these are common sentiments that customers frequently mention. Words like “color,” “size,” and “fabric” suggest these are important factors for customers when evaluating tops. Overall, this word cloud reflects positive customer experiences.

Analysis of Brand Average Ratings in Relation to Pricing and Product Count

# Aggregating brand data using Dataset 1 - product_catalog

# Per-brand mean price and product count from the catalogue, restricted to
# brands that have more than 50 products.
agg_product_data <- product_catalog %>%
  group_by(ProductBrand) %>%
  summarise(
    AvgPrice1 = mean(PriceINR, na.rm = TRUE),
    ProductCount = n(),
    .groups = "drop"
  ) %>%
  filter(ProductCount > 50)

# Aggregating brand data using Dataset 2 - Product_Detailed_2

# Per-brand mean original price and mean rating from the detailed dataset,
# restricted to brands with more than 50 rows. The row count is only needed
# for that filter and is dropped again afterwards.
agg_detailed_data <- Product_Detailed_2 %>%
  group_by(BrandName) %>%
  summarise(
    AvgPrice2 = mean(`OriginalPrice (in Rs)`, na.rm = TRUE),
    AvgRating = mean(Ratings, na.rm = TRUE),
    n_products = n(),
    .groups = "drop"
  ) %>%
  filter(n_products > 50) %>%
  select(-n_products)

# Keep only brands that appear in both aggregates; the brand column is called
# ProductBrand on the left and BrandName on the right.
combined_data <- merge(
  agg_product_data,
  agg_detailed_data,
  by.x = "ProductBrand",
  by.y = "BrandName"
)

head(combined_data)
        ProductBrand AvgPrice1 ProductCount AvgPrice2 AvgRating
1              Alcis  939.0000           65  1704.936  4.087042
2            AURELIA 1035.8586          304  1718.111  4.101044
3             Basics 1460.5663           83  2004.515  3.855172
4 Calvin Klein Jeans 3697.8462          130  6204.202  4.184255
5     Carlton London  703.0794           63  1806.791  4.105970
6            Chkokko  541.0000           59  2826.681  4.038087
# Building and summarizing a glm model
# The model regresses each brand's average rating (AvgRating, from dataset 2)
# on its average catalogue price (AvgPrice1) and product count (ProductCount),
# both from dataset 1. With the gaussian family and its default identity link
# this is equivalent to an ordinary linear regression.

glm_model <- glm(AvgRating ~ AvgPrice1 + ProductCount, family = gaussian(), data = combined_data)

# Print the coefficient table, deviances and AIC of the fitted model
summary(glm_model)

Call:
glm(formula = AvgRating ~ AvgPrice1 + ProductCount, family = gaussian(), 
    data = combined_data)

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.065e+00  5.395e-02  75.345   <2e-16 ***
AvgPrice1    3.787e-05  3.311e-05   1.144    0.260    
ProductCount 1.234e-04  1.456e-04   0.847    0.403    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.02117982)

    Null deviance: 0.78344  on 37  degrees of freedom
Residual deviance: 0.74129  on 35  degrees of freedom
AIC: -33.765

Number of Fisher Scoring iterations: 2

A Generalized Linear Model (GLM) is used to explore how the average price and the product count relate to the average rating for brands with more than 50 products.

The GLM analysis, focusing on the average price and the product count as predictors of average rating, reveals that neither average price nor product count significantly affects the average ratings in a statistically meaningful way.

# Adding predicted ratings
# Store the model's predictions for the rows it was fitted on
combined_data$predicted_ratings <- predict(glm_model, type = "response")

# Plotting actual against predicted ratings; the dashed red diagonal marks
# where a perfect prediction would fall.
ggplot(combined_data, aes(x = AvgRating, y = predicted_ratings)) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  geom_point(aes(color = "Actual vs. Predicted"), alpha = 0.6) +
  scale_color_manual(values = c("Actual vs. Predicted" = "blue")) +
  labs(
    title = "Actual vs. Predicted Ratings",
    x = "Actual Ratings",
    y = "Predicted Ratings"
  ) +
  theme_minimal()

While the statistical model did not reveal significant predictors for average ratings, the visualization of actual versus predicted ratings is presented here to show the model’s power.

We can see that most points are clustered around the line, but there is a spread, indicating some variance between the predicted and actual ratings.

Average price changes over time

We use the top 5 brands from the ‘brand_prices’ variable generated in the ‘Expensive and budget-friendly brands’ section, which is based on the product dataset (dataset 1) containing products from 2015. We also generate the top 5 brands from the detailed dataset (dataset 2), which contains products from 2020, restricted to brands that have more than 10 products. We then compare the prices over time.

# Aggregating brand data using Dataset 1 - product_catalog
# brand_prices is already sorted by descending average price, so its first
# five rows are the five most expensive brands; they are tagged with the year.
top5_expensive_brands_2015 <- brand_prices %>%
  slice_head(n = 5) %>%
  transmute(Brand = ProductBrand, AvgPrice = AveragePrice, Year = "2015")

# Aggregating brand data using Dataset 2 - Product_Detailed_2

# Mean original price per brand for brands with more than 10 rows; the five
# highest averages are kept and tagged with the year.
top5_expensive_brands_2020 <- Product_Detailed_2 %>%
  group_by(BrandName) %>%
  summarise(
    AvgPrice = mean(`OriginalPrice (in Rs)`, na.rm = TRUE),
    n_products = n(),
    .groups = "drop"
  ) %>%
  filter(n_products > 10) %>%
  arrange(desc(AvgPrice)) %>%
  slice_head(n = 5) %>%
  transmute(Brand = BrandName, AvgPrice, Year = "2020")


# Binding the two while keeping the year distinction
# Stack the 2015 rows on top of the 2020 rows; the Year column tells them apart
top_brands_combined <- bind_rows(
  list(top5_expensive_brands_2015, top5_expensive_brands_2020)
)

top_brands_combined
# A tibble: 10 × 3
   Brand             AvgPrice Year 
   <chr>                <dbl> <chr>
 1 SEIKO               18259. 2015 
 2 DKNY                16142. 2015 
 3 PRESTO              10981. 2015 
 4 Calvin Klein         8508. 2015 
 5 ahilya               6285. 2015 
 6 Teakwood Leathers   20832. 2020 
 7 WHITE FIRE          17104. 2020 
 8 Masaba              16952. 2020 
 9 DRESSTIVE           16558. 2020 
10 KARAGIRI            15166. 2020 
# Arranging the dataframe by AvgPrice in descending order

# Turn Brand into a factor whose levels follow descending average price, so
# that plots order the brands by price rather than alphabetically.
top_brands_combined <- top_brands_combined %>%
  mutate(
    Brand = factor(Brand, levels = unique(Brand[order(desc(AvgPrice))]))
  )

# Creating the plot and also displaying top 5 expensive brands from 2015 and 2020

df_2015 <- filter(top_brands_combined, Year == "2015")
df_2020 <- filter(top_brands_combined, Year == "2020")

print("Top 5 Expensive brands 2015:") 
[1] "Top 5 Expensive brands 2015:"
df_2015
# A tibble: 5 × 3
  Brand        AvgPrice Year 
  <fct>           <dbl> <chr>
1 SEIKO          18259. 2015 
2 DKNY           16142. 2015 
3 PRESTO         10981. 2015 
4 Calvin Klein    8508. 2015 
5 ahilya          6285. 2015 
print("Top 5 Expensive brands 2020:")
[1] "Top 5 Expensive brands 2020:"
df_2020
# A tibble: 5 × 3
  Brand             AvgPrice Year 
  <fct>                <dbl> <chr>
1 Teakwood Leathers   20832. 2020 
2 WHITE FIRE          17104. 2020 
3 Masaba              16952. 2020 
4 DRESSTIVE           16558. 2020 
5 KARAGIRI            15166. 2020 
# Creating the bar chart for 2015
# Bar chart of the 2015 top brands
p1 <- plot_ly(
  data = df_2015,
  x = ~Brand,
  y = ~AvgPrice,
  type = 'bar',
  name = '2015',
  marker = list(color = 'coral')
) %>%
  layout(yaxis = list(title = 'Average Price 2015 (INR)'))

# Bar chart of the 2020 top brands
p2 <- plot_ly(
  data = df_2020,
  x = ~Brand,
  y = ~AvgPrice,
  type = 'bar',
  name = '2020',
  marker = list(color = 'seagreen')
) %>%
  layout(
    yaxis = list(title = 'Average Price 2020 (INR)'),
    xaxis = list(title = 'Brand')
  )

# Stack the two charts vertically (nrows = 2).
# Note: shareX = TRUE means that all subplots will share the same x-axis.
subplot(p1, p2, nrows = 2, shareX = TRUE) %>%
  layout(title = 'Average Prices by Brand for 2015 and 2020')
# Adapted from: https://plotly.com/r/subplots/

This output shows a side-by-side comparison of average prices for various brands in two different years, 2015 and 2020. The orange bars represent the average prices in 2015, while the green bars represent the prices in 2020. The chart shows that, in general, brands are charging more for their products in 2020 than they did in 2015; prices have gone up in five years. The top brands have also changed, with newcomers like ‘Teakwood’ and ‘White Fire’; however, brands like ‘Seiko’ continue to dominate the expensive segment.

Brand location on world map using leaflet

# Loading the dataset that contains brands along with their founding country
location_of_brands <- read.csv("Location_of_brands.csv")

head(location_of_brands)
        Brands location
1     Priyaasi  Germany
2      AMMARZO  Denmark
3 John Players  Denmark
4         Nejo    India
5      RAREISM  Denmark
6       Mojama  Germany
# Finding lat and long using OSM geocoding
# Reduce the brand table to one row per country first, then look up each
# country's latitude and longitude through the OSM (Nominatim) geocoder.
geocoded_data <- location_of_brands %>%
  distinct(location, .keep_all = TRUE) %>%
  # Adapted from: https://geocoder.readthedocs.io/providers/OpenStreetMap.html
  geocode(country = location, method = 'osm') %>%
  select(location, lat, long)
Passing 10 addresses to the Nominatim single address geocoder
Query completed in: 10.2 seconds
print(geocoded_data)
# A tibble: 10 × 3
   location   lat    long
   <chr>    <dbl>   <dbl>
 1 Germany   51.2   10.4 
 2 Denmark   55.7   10.3 
 3 India     22.4   78.7 
 4 Japan     36.6  139.  
 5 China     35.0  105.  
 6 Italy     42.6   12.7 
 7 Canada    61.1 -108.  
 8 France    46.6    1.89
 9 Spain     39.3   -4.84
10 USA       39.8 -100.  
# Joining the lat and long with the brands with location dataset
# Attach coordinates to every brand via its country.
# geocoded_data holds exactly one row per location, so each brand row can match
# at most one coordinate row. Declaring the join "many-to-one" makes dplyr
# raise an error if a location is ever duplicated on the right-hand side,
# whereas "many-to-many" would silently multiply brand rows.
brand_location_geocoded <- location_of_brands %>%
  inner_join(geocoded_data, by = "location", relationship = "many-to-one") %>%
  select(Brands, location, lat, long)

head(brand_location_geocoded)
        Brands location      lat     long
1     Priyaasi  Germany 51.16382 10.44783
2      AMMARZO  Denmark 55.67025 10.33333
3 John Players  Denmark 55.67025 10.33333
4         Nejo    India 22.35111 78.66774
5      RAREISM  Denmark 55.67025 10.33333
6       Mojama  Germany 51.16382 10.44783
# Aggregating the data from detailed dataset 2 at brand level
# Mean original price and row count per brand; the grouping column is renamed
# to `Brands` while grouping so it matches the key of the brand location table.
aggregated_brands <- Product_Detailed_2 %>%
  group_by(Brands = BrandName) %>%
  summarise(
    AvgPrice = mean(`OriginalPrice (in Rs)`, na.rm = TRUE),
    ProductCount = n(),
    .groups = "drop"
  )


# Using the Brand aggregated dataset and brand geocoded dataset to create a list of top 3 brands for each country and product count for those brands

# For every country, keep the three brands with the highest average price and
# collapse them into a single HTML snippet (one brand per line).
top_brands_by_country <- aggregated_brands %>%
  inner_join(brand_location_geocoded, by = "Brands") %>%
  group_by(location) %>%
  # Sort by price, then number the rows within each country
  arrange(desc(AvgPrice)) %>%
  mutate(Rank = row_number()) %>%
  filter(Rank <= 3) %>%
  summarise(
    # paste0() with explicit spacing avoids the stray blanks that paste()'s
    # default separator produced, e.g. "Brand ( 123 INR -  4 products)".
    TopBrands = paste0(
      Brands, " (", round(AvgPrice, 2), " INR - ", ProductCount, " products)",
      collapse = "<br>"
    )
  ) %>%
  ungroup()

# Joining back with 'geocoded_data' to get latitude and longitude
# One row per country: coordinates plus the formatted top-brand text
country_info <- inner_join(
  distinct(geocoded_data, location, .keep_all = TRUE),
  top_brands_by_country,
  by = "location"
)

print(country_info)
# A tibble: 10 × 4
   location   lat    long TopBrands                                             
   <chr>    <dbl>   <dbl> <chr>                                                 
 1 Germany   51.2   10.4  Mameraa ( 12956.3 INR -  61 products)<br>NAKKASHI ( 1…
 2 Denmark   55.7   10.3  Teakwood Leathers ( 20832.33 INR -  12 products)<br>e…
 3 India     22.4   78.7  SHUBHKALA ( 14675 INR -  124 products)<br>KISAH ( 669…
 4 Japan     36.6  139.   WHITE FIRE ( 17103.9 INR -  77 products)<br>VENISA ( …
 5 China     35.0  105.   SABOO COLLECTIONS ( 13501.67 INR -  6 products)<br>BE…
 6 Italy     42.6   12.7  DRESSTIVE ( 16557.53 INR -  93 products)<br>THE AAB S…
 7 Canada    61.1 -108.   KARAGIRI ( 15166.38 INR -  29 products)<br>Fibre Worl…
 8 France    46.6    1.89 Aryavart ( 11027.76 INR -  49 products)<br>Binnis War…
 9 Spain     39.3   -4.84 Justanned ( 13926.92 INR -  49 products)<br>BEAVER ( …
10 USA       39.8 -100.   PS PRET BY PAYAL SINGHAL ( 13197.22 INR -  18 product…
# Creating the world map using leaflet
leaflet(country_info) %>%
  # Base map tiles
  # Adapted from: https://www.kaggle.com/code/devzohaib/interactive-maps-with-leaflet-in-r
  addProviderTiles(providers$CartoDB.Voyager) %>%
  # One marker per country; the popup shows the country name and its top brands
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    radius = 8,
    color = "coral",
    fillOpacity = 0.8,
    popup = ~paste0("<strong>", location, "</strong><br><strong>Top 3 Brands:</strong><br>", TopBrands)
  ) %>%
  # Initial view: whole world
  setView(lng = 20, lat = 0, zoom = 2)

Flowchart of analysis using mermaid

flowchart TB
    %% Input files
    subgraph csv1 ["myntra_products_catalog.csv"]
    end
    
    subgraph csv2 ["Product_Detailed_2.csv"]
    end
    
    subgraph csv3 ["Product_Reviews_3.csv"]
    end
    
    subgraph csv4 ["Location_of_brands.csv"]
    end
    
    %% Loading and preparation
    csv1 -->|Read| preprocess1[Preprocessing]
    csv2 -->|Read| preprocess2[Preprocessing]
    csv3 -->|Read| preprocess3[Preprocessing]
    csv4 -->|Geocode| preprocess4[Preprocessing]
    
    %% Analyses that use dataset 1 only
    preprocess1 --> analysis1.1[Expensive and Budget-Friendly Brands Analysis]
    preprocess1 --> analysis1.2[Gender wise product distribution]
    preprocess1 --> analysis1.3[Color wise product distribution]
    
    %% Analyses that combine datasets 1 and 2
    preprocess1 --> analysis2[Average price changes over time]
    preprocess2 --> analysis2[Average price changes over time]
    
    preprocess1 --> analysis4[Brand average ratings vs pricing and product count - GLM]
    preprocess2 --> analysis4[Brand average ratings vs pricing and product count - GLM]
    
    %% Analyses that combine datasets 2 and 3
    preprocess2 --> analysis3[Sentiment and Review Analysis]
    preprocess3 --> analysis3[Sentiment and Review Analysis]
    
    %% The word cloud is built for the category chosen by the sentiment analysis
    analysis3 --> analysis3.1[Wordcloud of top category]
    
    preprocess2 --> geo[Top 3 Brands by location on world map]
    preprocess4 --> geo[Top 3 Brands by location on world map]